#!/usr/bin/python3

import os
import re
import sys

# Per-file diagnostics are printed by parse_file() when --verbose is passed.
verbose = '--verbose' in sys.argv

# write_output() emits no stanza for files whose only copyright holder is
# default_copyright and whose license is one of default_licenses.
# NOTE(review): presumably those are covered by a hand-written stanza outside
# the auto-generated block -- confirm against debian/copyright.
default_copyright = 'The Qt Company Ltd.'
default_licenses = (
    'BSD-3-clause',
    'LGPL-3 or GPL-2',
    'GPL-3 with Qt-1.0 exception',
    'GFDL-NIV-1.3',
)
# Captures the license expression that follows an SPDX-License-Identifier tag.
header_re = re.compile(r"SPDX-License-Identifier: (.*)")
# Matches a copyright line and exposes the named groups min_year, max_year
# (None when a single year is given) and author.
copyright_re = re.compile(
    r"(?:Copyright|SPDX-FileCopyrightText):? "  # "Copyright" word or SPDX tag, and optional colon
    r"(?:\([Cc]\) )?"  # Optional (C) symbol
    r"(?P<min_year>\d{4})"  # Single year or beginning of range
    r"(?: ?-? ?(?P<max_year>\d{4}))? "  # Optional end of range
    r"(?P<author>[^0-9 ].+)"  # Author (should not start with digit or space)
)

# Maps a pattern searched for in the raw author string to the canonical name
# written to the output. canonicalize_author_name() returns the replacement of
# the first pattern (in insertion order) that matches.
author_map = {
    re.compile('^BlackBerry|^Research [Ii]n Motion'): 'BlackBerry Limited (formerly Research In Motion)',
    re.compile('^BogDan Vatra'): 'BogDan Vatra <bogdan@kde.org>',
    re.compile('^Canonical'): 'Canonical, Ltd.',
    re.compile('^David Faure'): 'David Faure <faure@kde.org>',
    re.compile('^Giuseppe D\'Angelo'): 'Giuseppe D\'Angelo <dangelog@gmail.com>',
    re.compile('^Governikus GmbH & Co. KG'): 'Governikus GmbH & Co. KG.',
    re.compile('^Green Hills Software'): 'Green Hills Software',
    re.compile('^Intel Corporation'): 'Intel Corporation',
    re.compile('^Ivan Komissarov'): 'Ivan Komissarov <ABBAPOH@gmail.com>',
    re.compile('KDAB'): 'Klarälvdalens Datakonsult AB, a KDAB Group company',
    re.compile('^Konstantin Ritt'): 'Konstantin Ritt <ritt.ks@gmail.com>',
    re.compile('^Lorn Potter'): 'Lorn Potter',
    re.compile(r'Martsum .*tmartsum\[at\]gmail.com'): 'Thorbjørn Lund Martsum <tmartsum@gmail.com>',
    re.compile('^Olivier Goffart'): 'Olivier Goffart <ogoffart@woboq.com>',
    re.compile('^Richard J. Moore'): 'Richard J. Moore <rich@kde.org>',
    re.compile('^Robin Burchell'): 'Robin Burchell <robin.burchell@viroteck.net>',
    re.compile('^Samuel Gaist'): 'Samuel Gaist <samuel.gaist@edeltech.ch>',
    re.compile('^Stephen Kelly'): 'Stephen Kelly <steveire@gmail.com>',
    re.compile('^The Qt Company'): 'The Qt Company Ltd.',
}

# Maps SPDX license identifiers to the short names used in the output.
# parse_file() passes identifiers that are not listed here through unchanged.
licenses_map = {
    'BSD-3-Clause': 'BSD-3-clause',
    'GFDL-1.3-no-invariants-only': 'GFDL-NIV-1.3',
    'GPL-2.0-only': 'GPL-2',
    'GPL-3.0-only': 'GPL-3',
    'GPL-3.0-only WITH Qt-GPL-exception-1.0': 'GPL-3 with Qt-1.0 exception',
    'LGPL-3.0-only': 'LGPL-3',
    'MIT': 'Expat',
}

# get_source_files() skips every path that starts with one of these prefixes
# (compared after a leading './' has been stripped).
exclude_prefixes = (
    'header',
    '.git',
)

# Marker lines in debian/copyright; main() replaces the text between them.
start_header = '## BEGIN AUTO GENERATED BLOCK'
end_header = '## END AUTO GENERATED BLOCK'


class CopyrightInfo:
    """Accumulate files together with the year span of each copyright holder.

    Attributes are plain containers:
    min_years -- holder name -> earliest year seen so far
    max_years -- holder name -> latest year seen so far
    files     -- file names in the order they were added
    """

    def __init__(self):
        self.min_years = {}
        self.max_years = {}
        self.files = []

    def add_file(self, authors, file):
        """Record *file* and widen the year spans with its copyright entries.

        *authors* is an iterable of ``(min_year, max_year, author)`` tuples.
        """
        for first, last, holder in authors:
            # A holder seen for the first time falls back to its own years.
            earliest_known = self.min_years.get(holder, first)
            latest_known = self.max_years.get(holder, last)
            self.min_years[holder] = min(earliest_known, first)
            self.max_years[holder] = max(latest_known, last)
        self.files.append(file)

    def get_strings(self, authors):
        """Yield ``'YEAR author'`` or ``'MIN-MAX author'`` for each of *authors*.

        Raises KeyError (on iteration) for an author that was never added.
        """
        for holder in authors:
            first = self.min_years[holder]
            last = self.max_years[holder]
            if first != last:
                yield '%d-%d %s' % (first, last, holder)
            else:
                yield '%d %s' % (first, holder)


def canonicalize_author_name(author):
    """Return the canonical spelling of *author*.

    The replacement of the first author_map pattern found in *author* is
    returned; when no pattern matches, *author* is returned unchanged.
    """
    candidates = (
        canonical
        for pattern, canonical in author_map.items()
        if pattern.search(author)
    )
    return next(candidates, author)


def parse_file(filename):
    """Extract the license and copyright holders from the head of a file.

    Only the leading lines of the file are scanned (``readlines(500)``).
    Lines matching copyright_re contribute a copyright entry; a line matching
    header_re sets the license, with the last such line winning.

    Returns a ``(license, authors)`` tuple:
    license -- the ' or '-joined license names, or None if no
               SPDX-License-Identifier line was found
    authors -- list of ``(min_year, max_year, author)`` tuples, or None if
               the file could not be decoded as text

    A warning is printed to stderr when a license is found without any
    copyright holder; with --verbose every other file is listed on stdout.
    """
    license = None
    authors = []
    # Decode as UTF-8 explicitly: the default encoding of open() depends on
    # the locale, which would mis-decode non-ASCII author names (or wrongly
    # classify files as binary) when the script runs under a non-UTF-8 locale.
    with open(filename, encoding='utf-8') as file:
        try:
            data = file.readlines(500)
        except UnicodeDecodeError:
            # Not valid text: treat as binary, flagged by authors = None.
            data = []
            authors = None
    for line in data:
        match = copyright_re.search(line)
        if match:
            min_year = int(match.group("min_year"))
            # A single year is a range that starts and ends in that year.
            max_year = int(match.group("max_year") or min_year)
            author = canonicalize_author_name(match.group("author"))
            authors.append((min_year, max_year, author))
        match = header_re.search(line)
        if match:
            licenses = [
                licenses_map.get(name, name)
                for name in match.group(1).split(' OR ')
            ]
            if 'LGPL-3' in licenses and 'GPL-3' in licenses:
                # LGPL-3 assumes GPL-3, no need to mention it explicitly
                licenses.remove('GPL-3')
            if 'LicenseRef-Qt-Commercial' in licenses:
                # Also no need to mention commercial license
                licenses.remove('LicenseRef-Qt-Commercial')
            license = ' or '.join(licenses)
    if license and not authors:
        print(f'{filename} ({license}): No authors!', file=sys.stderr)
    elif verbose:
        if authors is None:
            print(f'{filename} (binary)')
        elif license is None:
            print(f'{filename} (unknown)')
        else:
            print(f'{filename} ({license})')
    return license, authors


def get_source_files(root_directory):
    """Yield the path of every file below *root_directory* that is not excluded.

    A leading './' is stripped from each path, and paths starting with one of
    exclude_prefixes are skipped.
    """
    skipped_prefixes = tuple(exclude_prefixes)
    for directory, _subdirectories, names in os.walk(root_directory):
        for name in names:
            path = os.path.join(directory, name)
            if path.startswith('./'):
                path = path[2:]
            if not path.startswith(skipped_prefixes):
                yield path


def format_list(title, strings):
    """Return *title* followed by *strings*, one per line.

    Continuation lines are indented by the width of *title* so that all
    items line up in the same column.
    """
    indent = ' ' * len(title)
    separator = '\n' + indent
    return title + separator.join(strings)


def main(root_directory):
    """Regenerate the auto-generated block of debian/copyright.

    The text up to and including the start_header line and the text from the
    end_header line onwards are kept; everything in between is replaced by
    the stanzas generated from the files below *root_directory*.

    Exits with an error message (SystemExit) when either marker is missing
    or the end marker precedes the start marker, instead of writing a
    corrupted file.
    """
    # The Debian copyright format mandates UTF-8, so do not rely on the locale.
    with open('debian/copyright', encoding='utf-8') as copyright_file:
        current_copyright = copyright_file.read()

    start_index = current_copyright.find(start_header)
    end_index = current_copyright.find(end_header)
    # str.find() returns -1 for a missing marker; slicing with that value
    # would silently duplicate or drop parts of the file.
    if start_index == -1 or end_index == -1 or end_index < start_index:
        sys.exit(
            'debian/copyright: could not find %r followed by %r'
            % (start_header, end_header)
        )

    # Keep the start marker and the newline that terminates it.
    start_pos = start_index + len(start_header) + 1
    start_data = current_copyright[:start_pos]
    # Keep the newline that precedes the end marker.
    end_pos = end_index - 1
    end_data = current_copyright[end_pos:]

    # Scan the sources before the output file is truncated for writing.
    data = read_input(root_directory)

    with open('debian/copyright', 'w', encoding='utf-8') as output_file:
        output_file.write(start_data)
        write_output(data, output_file)
        output_file.write(end_data)


def read_input(root_directory):
    """Collect copyright information for all files below *root_directory*.

    Returns a nested dict: license string -> sorted tuple of author names ->
    CopyrightInfo holding the files that share that license and author set.
    Files for which parse_file() finds no license are ignored.
    """
    by_license = {}

    for path in get_source_files(root_directory):
        license, entries = parse_file(path)
        if license is None:
            continue
        by_holders = by_license.setdefault(license, {})
        # Index 2 of each entry is the author name; de-duplicate and sort
        # the names so they can serve as a dictionary key.
        holders = tuple(sorted({entry[2] for entry in entries}))
        if holders not in by_holders:
            by_holders[holders] = CopyrightInfo()
        by_holders[holders].add_file(entries, path)
    return by_license


def write_output(data, output_file):
    """Write one section per license, with one stanza per set of authors.

    *data* is the nested dict produced by read_input(). Licenses and author
    tuples are written in sorted order. The stanza for files held solely by
    default_copyright under one of default_licenses is omitted, although the
    '## <license>' heading is still written.
    """
    emit = output_file.write
    for license in sorted(data):
        emit('\n## ' + license + '\n')
        by_holders = data[license]
        for holders in sorted(by_holders):
            is_default = holders == (default_copyright,) and license in default_licenses
            if not is_default:
                info = by_holders[holders]
                emit('\n')
                emit(format_list('Files: ', sorted(info.files)) + '\n')
                emit(format_list('Copyright: ', info.get_strings(holders)) + '\n')
                emit('License: ' + license + '\n')


# Script entry point: scan the current directory. main() reads and rewrites
# the relative path debian/copyright, so run this from the directory that
# contains debian/.
if __name__ == '__main__':
    main('.')
